library(tidyverse)
library(rlang)
library(quanteda)
library(readr)
library(readxl)

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

#Load Twitter and Facebook data. These data sets cannot be shared publicly.
load("MPs FB Twitter/MP_fbposts.Rda")
load("MPs FB Twitter/MP_tweets.Rda")

#load candidate data including Bundeswahlleiter data
load("Data/raw_data.Rda")


#---------------------------------------------------------#
## ---CONSTITUENCY-LEVEL DATA (only for list-only MPs) ####
#---------------------------------------------------------#

# get the constituency level data
constituency_data <- df %>%
  dplyr::select("constituency", "population_density", "household_income", "universityentrancepercent", 
         "youngdemographypercent") %>%
  unique() %>%
  drop_na()


df <- df %>%
  mutate(
    # manually add constituency for list-only MPs (according to residence)
    unique_id = as.numeric(unique_id),
    constituency = as.numeric(constituency),
    constituency = case_when(!is.na(constituency) ~ constituency,
                             
                             unique_id == 143 ~ 18,
                             unique_id == 381 ~ 37,
                             unique_id == 445 ~ 54 ,
                             unique_id == 549 ~ 75,
                             unique_id == 842 ~ 100,
                             
                             unique_id == 852 ~ 144,
                             unique_id == 1073 ~ 173,
                             unique_id == 1534 ~ 267,
                             unique_id == 1787 ~ 231,
                             unique_id == 1982 ~ 296,
                             
                             unique_id == 1991 ~ 296,
                             unique_id == 2068 ~ 78,
                             unique_id == 2080 ~ 79,
                             unique_id == 2081 ~ 75,
                             unique_id == 2082 ~ 79,
                             
                             unique_id == 2176 ~ 61,
                             unique_id == 2178 ~ 64,
                             unique_id == 2179 ~ 62,
                             unique_id == 2352 ~ 76,
                             unique_id == 2420 ~ 73,
                             
                             unique_id == 2507 ~ 192,
                             unique_id == 2509 ~ 196)) %>%
  # drop constituency-variables
  dplyr::select(-c("population_density", "household_income", "universityentrancepercent", 
            "youngdemographypercent")) %>%
  # merge constituency variables (also for list-only MPs now)
  merge(constituency_data, by.x = "constituency", by.y = "constituency")

#---------------------------------------------------------#
## ---ELECTORAL MARGINALITY DATA                       ####
#---------------------------------------------------------#

btw17_kerg <- read_delim("Data/btw17_kerg.csv", 
                         ";", escape_double = FALSE, trim_ws = TRUE, 
                         skip = 5)

btw17_kerg_new <- btw17_kerg %>% 
  subset(`gehört zu` != 99) %>%
  dplyr::select(c(1,2,4,8,12,16,20,24,28,32,36,40,44))

# make columns numeric
cols <- names(btw17_kerg_new )[7:13]
btw17_kerg_new [cols] <- lapply(btw17_kerg_new [cols], as.numeric)

#find max value
btw17_kerg_new[, "max_value"] <- apply(btw17_kerg_new[, 7:13], 1, max,na.rm=TRUE)

# create new variables
btw17_kerg_new <- btw17_kerg_new %>%
  as.data.frame() %>%
  mutate(CDU_share = as.numeric(`Christlich Demokratische Union Deutschlands`)/as.numeric(Gültige),
         SPD_share = as.numeric(`Sozialdemokratische Partei Deutschlands`)/as.numeric(Gültige),
         LINKE_share = as.numeric(`DIE LINKE`)/as.numeric(Gültige),
         GRÜNEN_share = as.numeric(`BÜNDNIS 90/DIE GRÜNEN`)/as.numeric(Gültige),
         CSU_share = as.numeric(`Christlich-Soziale Union in Bayern e.V.`)/as.numeric(Gültige),
         
         FDP_share = as.numeric(`Freie Demokratische Partei`)/as.numeric(Gültige),
         AfD_share = as.numeric(`Alternative für Deutschland`)/as.numeric(Gültige),
         max_value_share = as.numeric(max_value)/as.numeric(Gültige)) %>%
  
  mutate(CDU_marginality = abs(CDU_share-max_value_share),
         SPD_marginality = abs(SPD_share-max_value_share),
         LINKE_marginality = abs(LINKE_share-max_value_share),
         GRÜNEN_marginality = abs(GRÜNEN_share-max_value_share),
         CSU_marginality = abs(CSU_share-max_value_share),
         
         FDP_marginality = abs(FDP_share-max_value_share),
         AfD_marginality = abs(AfD_share-max_value_share))

# find second min value
btw17_kerg_new$second_min_value = apply(btw17_kerg_new[, 23:29], 1, function(x) (sort(x))[2])


# if winner (marginality=0), use vote share distance to second best party (second_min_value)
btw17_kerg_new$CDU  <- ifelse(btw17_kerg_new$CDU_marginality == 0, btw17_kerg_new$second_min_value,
                              btw17_kerg_new$CDU_marginality)
btw17_kerg_new$SPD <- ifelse(btw17_kerg_new$SPD_marginality == 0, btw17_kerg_new$second_min_value,
                             btw17_kerg_new$SPD_marginality)
btw17_kerg_new$'Left party'  <- ifelse(btw17_kerg_new$LINKE_marginality == 0, btw17_kerg_new$second_min_value,
                                btw17_kerg_new$LINKE_marginality)
btw17_kerg_new$Greens <- ifelse(btw17_kerg_new$GRÜNEN_marginality == 0, btw17_kerg_new$second_min_value,
                                btw17_kerg_new$GRÜNEN_marginality)
btw17_kerg_new$CSU <- ifelse(btw17_kerg_new$CSU_marginality == 0, btw17_kerg_new$second_min_value,
                             btw17_kerg_new$CSU_marginality)

btw17_kerg_new$FDP <- ifelse(btw17_kerg_new$FDP_marginality == 0, btw17_kerg_new$second_min_value,
                             btw17_kerg_new$FDP_marginality)
btw17_kerg_new$AfD  <- ifelse(btw17_kerg_new$AfD_marginality == 0, btw17_kerg_new$second_min_value,
                              btw17_kerg_new$AfD_marginality)


## select important variables
btw17_kerg_new <- btw17_kerg_new %>%
  dplyr::select(Nr, CDU, SPD, 'Left party', Greens, CSU, FDP, AfD) 

# bring in long format
electoral_marginality <- pivot_longer(btw17_kerg_new, cols =c(CDU, SPD, 'Left party', Greens, CSU, FDP, AfD),
                                      names_to = "party",
                                      values_to = "electoral_marginality") %>%
  dplyr::rename(constituency = Nr) %>%
  mutate(chance_to_win_district_05 = case_when(electoral_marginality < 0.05 ~ 1,
                                               electoral_marginality >= 0.05 ~ 0),
         chance_to_win_district_025 = case_when(electoral_marginality < 0.025 ~ 1,
                                                electoral_marginality >= 0.025 ~ 0),
         chance_to_win_district_10 = case_when(electoral_marginality < 0.1 ~ 1,
                                               electoral_marginality >= 0.1 ~ 0))

### merge
df <- df %>%
merge(electoral_marginality, by.x = c("constituency", "party"), 
      by.y = c("constituency", "party"))


#---------------------------------------------#
## --- ELECTORAL VIABILITY                 ####
#---------------------------------------------#

df <- df %>% 
  mutate(state = recode(state, 
                        `1` = "Schleswig-Holstein",
                        `2` = "Hamburg",
                        `3` = "Niedersachsen",
                        `4` = "Bremen",
                        `5` = "Nordrhein-Westfalen",
                        `6` = "Hessen",
                        `7` = "Rheinland-Pfalz",
                        `8` = "Baden-Württemberg",
                        `9` = "Bayern",
                        `10` = "Saarland",
                        `11` = "Berlin",
                        `12` = "Brandenburg",
                        `13` = "Mecklenburg-Vorpommern",
                        `14` = "Sachsen",
                        `15` = "Sachsen-Anhalt",
                        `16` =  "Thüringen"),
         party = recode(party, "CDU" = "CDU/CSU",
                        "CSU"= "CDU/CSU")) %>% 
  as_tibble()


# list/direct candidates per country/party
df_list_party <- df %>% 
  #filter(direct == 0) %>% 
  group_by(party) %>% 
  mutate(n_party = n_distinct(unique_id)) %>% 
  ungroup() %>% 
  group_by(party, state) %>% 
  summarise(n = n(),
            freq = n / mean(n_party)) %>% 
  ungroup()

# predictions from an average of major polls in 2017-Feb.2020
forecast <- read_excel("Data/forecasts.xlsx") %>% 
  as_tibble() %>% 
  dplyr::select(-Year) %>% 
  pivot_longer(-Date) %>% 
  dplyr::select(-Date) %>% 
  group_by(name) %>% 
  summarise(predicted_perc = mean(value)) %>% 
  rename(party = name) %>% 
  filter(party != "Sonstige") %>% 
  mutate(party = recode(party, "GRÜNE" = "Greens",
                        "LINKE" = "Left party"),
         perc_total = sum(predicted_perc)) %>% 
  group_by(party) %>% 
  mutate(predicted_perct = predicted_perc / perc_total,
         predicted_seats = predicted_perct*709)

# join 2017-20 Bundestag party-state seat distribution and survey forecasts
df_state_predictions <- df_list_party %>% 
  left_join(forecast[c("party", "predicted_seats")], by = "party") %>% 
  mutate(predicted_seats = freq * predicted_seats)

#1) To incorporate uncertainty, the standard deviation of discrepancy between the predictions and
#   the seats that were actually won was calculated for each state 
discrepancy <- df_state_predictions %>% 
  rowwise() %>% 
  mutate(discrepancy = predicted_seats-n) %>% 
  ungroup() %>% #TODO replace 2017 seats by 2021 seats
  group_by(state) %>% 
  mutate(sd_discrepancy = sd(discrepancy, na.rm = T)) %>% 
  rename(seats = n) 

# join forecasts and candidates
df <- df %>% 
  left_join(discrepancy %>% dplyr::select(state, party, seats, sd_discrepancy), by = c("state", "party"))

#2) Candidates with a list position below the predicted seats minus one standard deviation (rounded) 
#were classified as save. Candidates with a list position above the predicted
#seats plus one standard deviation (rounded) were classified as unpromising. All other candiates 
#were classified as doubtful.
df <- df %>% 
  mutate(viability = case_when(list_place < (seats - sd_discrepancy) ~ 
                                 "safe",
                               list_place > (seats + sd_discrepancy) ~ 
                                 "unpromising",
                               is.na(list_place) ~ NA_character_,
                               TRUE ~ "doubtful"),
         safelist = case_when(viability == "safe" ~ 1,
                                       viability != "safe" ~ 0,
                                       is.na(viability) ~ 0))

#Inspect
# test <- df %>% select(name, firstname, state, party, list_place, seats, sd_discrepancy, viability, direct)
# View(test)

#---------------------------------------------#
## --- REGIONALIZED WORDING                ####
#---------------------------------------------#

#* 1. create dictionary of regionalization ----

myDict <- dictionary(list(regionalization = 
                            c("*wahlkreis*", "örtlich*", 
                              "regional*", "lokal*", 
                              "hiesig*", "räumlich*", "*heimisch*" )))


#* 2. Facebook data ----

#build corpus
fb_short <- subset(fb, select = c(page_id, message))
corpFB <- corpus(fb_short, text_field = "message")

#build dtm 
dtmFB <- corpFB %>% 
  tokens(remove_punct = TRUE,
         remove_url = TRUE, #remove_twitter = TRUE, 
         remove_numbers = TRUE,
         split_hyphens = TRUE,
         remove_symbols = TRUE,
         include_docvars = TRUE) %>% 
  tokens_select(pattern = c(stopwords("de")), selection = "remove") %>% 
  dfm() %>% 
  dfm_group(groups = page_id) %>% 
  dfm_lookup(dictionary = myDict)
mentionsFB <- convert(dtmFB, to = "data.frame") %>% 
  rename(fb_regionalization = regionalization,
         page_id = doc_id)

#freq by account
fb_freq <- as.data.frame(table(fb$page_id))
fb_freq$fb_active <- 1
fb_freq <- fb_freq %>% rename(page_id = Var1,
                              fb_freq = Freq) 

#merge regionalization and fb_freq
fb_freq <- left_join(fb_freq, mentionsFB, by = "page_id")

#merge with df; two Facebook account columns, thus two separate merges
df <- merge(df, fb_freq, by.x = "fb_id1", by.y = "page_id" , all.x = TRUE)
df <- merge(df, fb_freq, by.x = "fb_id2", by.y = "page_id" , all.x = TRUE)

#combine the variables
df$fb_freq <- ifelse(is.na(df$fb_freq.x) , ifelse(is.na(df$fb_freq.y) , NA, df$fb_freq.y), df$fb_freq.x)
df$fb_active <- ifelse(is.na(df$fb_active.x) , ifelse(is.na(df$fb_active.y) , NA, df$fb_active.y), df$fb_active.x)
df$fb_regionalization <- ifelse(is.na(df$fb_regionalization.x) , ifelse(is.na(df$fb_regionalization.y) , NA, df$fb_regionalization.y), df$fb_regionalization.x)

#drop redundant variables
df <- dplyr::select(df, -c(fb_freq.x,fb_freq.y, fb_active.x, fb_active.y, 
                           fb_regionalization.x, fb_regionalization.y))


#* 3. Twitter data ----

#create corpus
tw_short <- subset(tweets, select = c(user.id, text) )
corpTW <- corpus(tw_short, text_field = "text")

#build dtm
dtmTW <- corpTW %>% 
  tokens(remove_punct = TRUE,
         remove_url = TRUE, #remove_twitter = TRUE, 
         remove_numbers = TRUE,
         split_hyphens = TRUE,
         remove_symbols = TRUE,
         include_docvars = TRUE) %>% 
  tokens_select(pattern = c(stopwords("de")), selection = "remove") %>% 
  dfm() %>% 
  dfm_group(groups = user.id) %>% 
  dfm_lookup(dictionary = myDict)
mentionsTW <- convert(dtmTW, to = "data.frame") %>% 
  rename(tw_regionalization = regionalization,
         user.id = doc_id)

#freq by account
tw_freq <- as.data.frame(table(tweets$user.id))
tw_freq$tw_active <- 1
tw_freq <- tw_freq %>% rename(user.id = Var1,
                              tw_freq = Freq) 

#merge regionalization and tw_freq
tw_freq <- left_join(tw_freq, mentionsTW, by = "user.id")

#merge with df
df <- merge(df, tw_freq, by.x = "tw_id", by.y = "user.id" , all.x = TRUE)


#---------------------------------------------#
## ---  GEOGRAPHIC REFERENCES              ####
#---------------------------------------------#

#* 1. create dictionary of geographic references ----

#load and preprocess data
BTW17_Gemeinden <- read_delim("Data/BTW17_WK_Gemeinden_noheader.csv", ";", 
                              escape_double = FALSE, col_types = cols(RGS_Land = col_skip(), 
                                                                      RGS_RegBez = col_skip(), RGS_Kreis = col_skip(),
                                                                      RGS_GemVerband = col_skip(), 
                                                                      RGS_Gemeinde = col_skip(), 
                                                                      `RegBez-Name` = col_skip(), 
                                                                      Gemeindeteil = col_skip(), 
                                                                      `Wahlkreis-von` = col_skip(), 
                                                                      `PLZ-mehrere` = col_skip()), trim_ws = TRUE)

#erase all commas including following signs
BTW17_Gemeinden$Gemeindename <- tolower(str_replace(BTW17_Gemeinden$Gemeindename,",.*$",""))
BTW17_Gemeinden$Gemeindename <- str_trim(BTW17_Gemeinden$Gemeindename, side = "both")

#function to create a vector as a dictionary for a given WK
create_dict <- function(WK){
  DictWK<- subset(BTW17_Gemeinden$Gemeindename, BTW17_Gemeinden$`Wahlkreis-Nr` == WK)
  return(DictWK)
}

#loop for assigning an individualized dictionary to each district (Wahlkreis)
DictList <- list()

for (i in 1:299){
  
  DictList[[i]] <- create_dict(i)
}

#create quanteda dictionary
names(DictList) <- paste0("WK", 1:299)
myDictWK <- dictionary(DictList)


#* 2. Facebook data ----

#build dtm 
dtmWK_FB <- corpFB %>% 
  tokens(remove_punct = TRUE,
         remove_url = TRUE, #remove_twitter = TRUE, 
         remove_numbers = TRUE,
         split_hyphens = TRUE,
         remove_symbols = TRUE,
         include_docvars = TRUE) %>% 
  tokens_select(pattern = c(stopwords("de")), selection = "remove") %>% 
  dfm() %>% 
  dfm_group(groups = page_id) %>% 
  dfm_lookup(dictionary = myDictWK)
mentionsWK_FB <- convert(dtmWK_FB, to = "data.frame") %>% 
  rename(page_id = doc_id)

dfWK_FB <- df %>%
  merge(mentionsWK_FB, by.x = "fb_id1", by.y = "page_id" , all.x = TRUE) %>%
  merge(mentionsWK_FB, by.x = "fb_id2", by.y = "page_id" , all.x = TRUE) %>%
  dplyr::select(-c(tw_id))

#loop to aggregate data from FB account 1 and FB account 2 (if present) AND
#assign district-related posts ONLY to MPs representing a given district
dfWK_FB$Posts2constituency <- NA
for (i in 1:299){
  variable_x = paste0("WK", i, ".x")
  variable_y = paste0("WK", i, ".y")
  dfWK_FB = dfWK_FB %>% 
    rowwise() %>% 
    mutate(new_var = case_when(
      !is.na(!!parse_quosure(variable_x)) | !is.na(!!parse_quosure(variable_y)) ~
        sum(!!parse_quosure(variable_x), !!parse_quosure(variable_y), na.rm = T)),
      Posts2constituency = case_when(constituency == i ~ new_var,
                                     TRUE ~ as.numeric(Posts2constituency)
      ))
}

#subset
dfWK_FB <- dfWK_FB %>%
  dplyr::select(unique_id, Posts2constituency)


#* 3. Twitter data ----

#build dtm
dtmWK_TW <- corpTW %>% 
  tokens(remove_punct = TRUE,
         remove_url = TRUE, #remove_twitter = TRUE, 
         remove_numbers = TRUE,
         split_hyphens = TRUE,
         remove_symbols = TRUE,
         include_docvars = TRUE) %>% 
  tokens_select(pattern = c(stopwords("de")), selection = "remove") %>% 
  dfm() %>% 
  dfm_group(groups = user.id) %>% 
  dfm_lookup(dictionary = myDictWK)
mentionsWK_TW <- convert(dtmWK_TW, to = "data.frame") %>% 
  rename(user.id = doc_id)

dfWK_TW <- df %>%
  merge(mentionsWK_TW, by.x = "tw_id", by.y = "user.id" , all.x = TRUE) 

#loop to assign district-related tweets ONLY to MPs representing a given district
dfWK_TW$Tweets2constituency <- NA
for (i in 1:299){
  variable_x = paste0("WK", i)
  dfWK_TW = dfWK_TW %>% 
    mutate(new_var = !!parse_quosure(variable_x),
           Tweets2constituency = case_when(constituency == i ~ new_var,
                                           TRUE ~ as.numeric(Tweets2constituency)
           ))
}

#subset
dfWK_TW <- dfWK_TW %>%
  dplyr::select(unique_id, Tweets2constituency)


#---------------------------------------------#
## ---  MERGE DATA AND ADD VARIABLES       ####
#---------------------------------------------#

#* 1. Merge with main data frame for analysis ----

df <- df %>% 
  as_tibble() %>% 
  mutate(
    # recode activity binary
    fb_active = ifelse(is.na(fb_active), 0, fb_active),
    tw_active = ifelse(is.na(tw_active), 0, tw_active),
    CDUCSU = ifelse(party == "CDU" | party == "CSU", 1, 0)
  ) %>% 
  dplyr::select(tw_id:age2017, renewed_candidacy, constituency, sex, party, state, isDC, isListed, CDUCSU,
                academic:tw_regionalization, -seats, -sd_discrepancy) %>% 
  merge(dfWK_TW) %>%
  merge(dfWK_FB) %>%
  mutate(fb_freq = as.numeric(fb_freq),
         tw_freq = as.numeric(tw_freq),
         partylistleader_0 = case_when(!is.na(partylistleader) ~ partylistleader,
                                       is.na(partylistleader) ~ 0),
         dual_candidacy = case_when(isDC == 1 & isListed == 1 ~ 1,
                                    isDC == 0 & isListed == 1 ~ 0,
                                    isDC == 1 & isListed == 0 ~ 0)) 

#* 2. Merge with dataset on followers, likes etc. ----
sm_metrics <- read_rds("Data/sm_metrics.rds")

df <- df %>%
  merge(sm_metrics[ , c("unique_id", "tw_ment_freq", "fb_pagelikes", "fb_comments", "tw_followers")], 
        by.x = "unique_id", by.y = "unique_id", all.x = T, all.y = T) %>%
  mutate(tw_ment_freq = as.numeric(tw_ment_freq),
         tw_ment_freq_0 = case_when(!is.na(tw_ment_freq) ~ tw_ment_freq,
                                    is.na(tw_ment_freq) ~ 0),
         fb_pagelikes_0 = case_when(!is.na(fb_pagelikes) ~ fb_pagelikes,
                                    is.na(fb_pagelikes) ~ 0),
         fb_comments_0 = case_when(!is.na(fb_comments) ~ fb_comments,
                                   is.na(fb_comments) ~ 0),
         tw_followers_0 = case_when(!is.na(tw_followers) ~ tw_followers,
                                    is.na(tw_followers) ~ 0))

#---------------------------------------------#
## ---  ADD MANIFESTO DATA                 ####
#---------------------------------------------#


manifesto_data <- readr::read_csv("Data/MPDataset_MPDS2021a.csv") %>%
  subset(countryname == "Germany" & date == "201709") %>%
  rename(national_way_of_life = per601,
         decentralization = per301) %>%
  dplyr::select(partyname, national_way_of_life, decentralization) %>%
  mutate(local_politics_index_additive = national_way_of_life+decentralization,
         local_politics_index_multiplicative = national_way_of_life*decentralization,
         party = case_when(partyname == "Alliance‘90/Greens" ~ "Greens",
                           partyname == "Alternative for Germany" ~ "AfD",
                           partyname == "Christian Democratic Union/Christian Social Union" ~ "CDU/CSU",
                           partyname == "Free Democratic Party" ~ "FDP",
                           partyname == "Social Democratic Party of Germany" ~ "SPD",
                           partyname == "The Left" ~ "Left party"))

df <- df  %>%
  merge(manifesto_data, by.x = "party", by.y = "party")  %>%
  mutate(safelist = case_when(viability == "safe" ~ 1,
                              viability != "safe" ~ 0,
                              is.na(viability) ~ 0)) %>%
  mutate(CDUCSU = case_when(party == "CDU/CSU" ~ 1,
                            party != "CDU/CSU" ~ 0))

#---------------------------------------------#
## ---  SAVE ALL DATA                      ####
#---------------------------------------------#

df %>%
  as_tibble() %>% 
  write_rds("Data/df_analysis.rds")



